#!/bin/bash
#SBATCH --job-name=lm-eval
#SBATCH --partition=general
#SBATCH --gres=gpu:a100:2
#SBATCH --cpus-per-task=32
#SBATCH --mem=64G
#SBATCH --time=12:00:00
#SBATCH --output=slurm/%x-%j.out
#SBATCH --error=slurm/%x-%j.err

# errexit is switched off: a failing command does not terminate the script.
set +e
# Unset variables are still errors, and a pipeline fails if any stage fails.
set -uo pipefail

# Required inputs. `${VAR:?msg}` aborts this non-interactive script with the
# given message when the variable is unset or empty.
: "${MODEL:?MODEL missing (pass with --export=MODEL=...)}"
# The $(which python) inside the message is a command substitution; it is only
# evaluated when PYTHON is missing, i.e. when the message is actually printed.
: "${PYTHON:?PYTHON missing (pass with --export=PYTHON=$(which python))}"
# Command prefix that runs the lm_eval module with the chosen interpreter.
LM_EVAL="$PYTHON -m lm_eval"

# Hugging Face / TensorFlow environment for the evaluation processes.
# The three Hugging Face cache locations share one root directory.
hf_cache_root=/XXX/huggingface_model/.cache
export HF_HOME="${hf_cache_root}/hf_home"
export HF_HUB_CACHE="${hf_cache_root}/hf_hub_cache"
export HF_XET_CACHE="${hf_cache_root}/hf_xet_cache"
# Prefer a token already present in the job environment (e.g. passed through
# sbatch --export) and only fall back to the built-in value when none is set,
# so the credential can be rotated without editing this file.
export HF_TOKEN="${HF_TOKEN:-XXX}"   # redacted token
# Opt in to executing model-generated code (presumably needed by the code
# tasks such as mbpp — confirm against the task list).
export HF_ALLOW_CODE_EVAL=1
# Ask huggingface_hub to use the hf_transfer download backend.
export HF_HUB_ENABLE_HF_TRANSFER=1
# Allow datasets that ship their own loading code.
export HF_DATASETS_TRUST_REMOTE_CODE=1
# Quieten TensorFlow's native logging.
export TF_CPP_MIN_LOG_LEVEL=3

# Root directory for evaluation results, relative to the job's working
# directory. No trailing slash: it is joined later as "${OUT_DIR}/<task>", and
# a trailing slash here would yield "output//<task>".
OUT_DIR="output"
# NOTE(review): the #SBATCH --output/--error files under slurm/ are presumably
# opened by Slurm before this line runs, so slurm/ may have to exist already at
# submission time — confirm.
mkdir -p slurm "$OUT_DIR"
# MODEL with every "/" and space replaced by "_".
# NOTE(review): not referenced anywhere else in the visible script.
SAFE_MODEL="${MODEL//[\/ ]/_}"

# Tasks to evaluate, one lm_eval run per entry.
# Bash array elements are separated by whitespace only; a comma would become
# part of the element (e.g. "mmlu,") and leak into --tasks and the output path.
TASKS_ALL=(mmlu ifeval mbpp bbh)

# Options forwarded to the vLLM backend, one key=value pair per entry.
# Only vLLM arguments belong here (apply_chat_template is not one of them).
# NOTE(review): both parallel sizes are 1 while the job header requests
# gpu:a100:2 — confirm whether the second GPU is meant to be used.
vllm_opts=(
  "pretrained=$MODEL"
  "tokenizer_mode=auto"
  "tensor_parallel_size=1"
  "data_parallel_size=1"
  "gpu_memory_utilization=0.7"
  "enforce_eager=False"
)
# Join the entries with commas, then drop the single trailing comma.
printf -v MODEL_ARGS '%s,' "${vllm_opts[@]}"
MODEL_ARGS="${MODEL_ARGS%,}"

# Log the effective settings at the top of the job output, one per line.
for banner_var in MODEL PYTHON OUT_DIR; do
  # ${!banner_var} is indirect expansion: the value of the variable so named.
  printf '>>> %s=%s\n' "$banner_var" "${!banner_var}"
done

# Evaluate every task in its own lm_eval run so that one failing task does not
# stop the remaining ones.
#
# The harness is started as "$PYTHON" -m lm_eval through an argv array instead
# of word-splitting the LM_EVAL command string, so an interpreter path that
# contains spaces or glob characters reaches exec intact.
lm_eval_cmd=("$PYTHON" -m lm_eval)
# Space-separated names of the tasks whose run returned non-zero.
failed_tasks=""

for task in "${TASKS_ALL[@]}"; do
  # printf rather than `echo -e`: backslashes inside $MODEL print literally.
  printf '\n▶▶ %s :: %s\n' "$MODEL" "$task"
  if ! "${lm_eval_cmd[@]}" --model vllm \
    --model_args "$MODEL_ARGS" \
    --tasks "$task" \
    --batch_size 1 \
    --seed 1234 \
    --verbosity INFO \
    --output_path "${OUT_DIR}/${task}"; then
    echo "⚠️  $MODEL :: $task failed; continuing."
    failed_tasks+=" $task"
  fi
done

# Repeat the failures in one place so they are not lost in a long log.
# The script's exit status is deliberately left unchanged (the final echo).
if [[ -n "$failed_tasks" ]]; then
  echo "⚠️  $MODEL :: failed tasks:$failed_tasks"
fi

echo "✅ Done $MODEL"
